{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 13 - Data Science\n",
    "## Data Preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 0 - Setting up the notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random\n",
    "from datetime import date, timedelta\n",
    "\n",
    "import faker"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1 - Preparing the Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create the faker to populate the data\n",
    "fake = faker.Faker()\n",
    "\n",
    "# Seed both random sources so that Restart Kernel -> Run All\n",
    "# regenerates the same users and campaign figures. Campaign dates\n",
    "# are still computed from date.today(), so they shift from day to day.\n",
    "SEED = 42\n",
    "random.seed(SEED)  # module-level random.* calls in later cells\n",
    "faker.Faker.seed(SEED)  # values produced through fake.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['{\"username\": \"epennington\", \"name\": \"Stephanie Gonzalez\", \"gender\": \"F\", \"email\": \"harmontammy@example.com\", \"age\": 44, \"address\": \"48185 Brittany Green Suite 999\\\\nEast Tanya, KS 25787\"}',\n",
       " '{\"username\": \"joshua61\", \"name\": \"Diana Richards\", \"gender\": \"F\", \"email\": \"ericfischer@example.net\", \"age\": 69, \"address\": \"229 Mcmillan Circles Suite 408\\\\nNorth Teresamouth, NJ 30402\"}',\n",
       " '{\"username\": \"dmoore\", \"name\": \"Erin Rose\", \"gender\": \"F\", \"email\": \"colleenroberts@example.com\", \"age\": 59, \"address\": \"USS Rivera\\\\nFPO AP 29852\"}']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# generate user profiles\n",
    "# simulate user data coming from an API. It is a list\n",
    "# of JSON strings (users).\n",
    "def get_users(no_of_users):\n",
    "    \"\"\"Yield no_of_users fake user profiles, one JSON string each.\n",
    "\n",
    "    Genders are drawn up front with weights 0.43 / 0.47 / 0.1 for\n",
    "    \"M\" / \"F\" / \"O\". The remaining fields come from the\n",
    "    module-level fake instance, and usernames are requested\n",
    "    through fake.unique.\n",
    "    \"\"\"\n",
    "    gender_codes = [\"M\", \"F\", \"O\"]\n",
    "    gender_weights = [0.43, 0.47, 0.1]\n",
    "    drawn_genders = random.choices(\n",
    "        gender_codes, weights=gender_weights, k=no_of_users\n",
    "    )\n",
    "    for gender in drawn_genders:\n",
    "        # the username is requested before the other fields\n",
    "        username = fake.unique.user_name()\n",
    "        profile = {\n",
    "            \"username\": username,\n",
    "            \"name\": get_random_name(gender),\n",
    "            \"gender\": gender,\n",
    "            \"email\": fake.email(),\n",
    "            \"age\": fake.random_int(min=18, max=90),\n",
    "            \"address\": fake.address(),\n",
    "        }\n",
    "        yield json.dumps(profile)\n",
    "\n",
    "\n",
    "def get_random_name(gender):\n",
    "    \"\"\"Return a fake full name chosen according to gender.\n",
    "\n",
    "    \"F\" uses fake.name_female() and \"M\" uses fake.name_male();\n",
    "    any other value falls back to fake.name_nonbinary().\n",
    "    \"\"\"\n",
    "    if gender == \"F\":\n",
    "        return fake.name_female()\n",
    "    if gender == \"M\":\n",
    "        return fake.name_male()\n",
    "    return fake.name_nonbinary()\n",
    "\n",
    "\n",
    "# get_users is a generator: materialise it into a list so the\n",
    "# users can be sliced here and passed to later cells\n",
    "users = list(get_users(1000))\n",
    "users[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# campaign name format:\n",
    "# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency\n",
    "def get_type():\n",
    "    \"\"\"Pick one of the four example internal type codes at random.\"\"\"\n",
    "    # just some meaningless example codes\n",
    "    return random.choice([\"AKX\", \"BYU\", \"GRZ\", \"KTR\"])\n",
    "\n",
    "\n",
    "def get_start_end_dates():\n",
    "    \"\"\"Return random start and end dates as YYYYMMDD strings.\n",
    "\n",
    "    The start date lies within 365 days either side of today and\n",
    "    the end date falls 1 to 730 days after the start.\n",
    "    \"\"\"\n",
    "    # the duration is drawn first, then the offset\n",
    "    duration_days = random.randint(1, 2 * 365)\n",
    "    offset_days = random.randint(-365, 365)\n",
    "    start = date.today() - timedelta(days=offset_days)\n",
    "    end = start + timedelta(days=duration_days)\n",
    "    return tuple(f\"{day:%Y%m%d}\" for day in (start, end))\n",
    "\n",
    "\n",
    "def get_age_range():\n",
    "    \"\"\"Return a random target age range formatted as low-high.\n",
    "\n",
    "    The lower bound is a multiple of 5 from 20 to 45; the range\n",
    "    spans 5 to 25 years, also in steps of 5.\n",
    "    \"\"\"\n",
    "    lower = random.randrange(20, 46, 5)\n",
    "    span = random.randrange(5, 26, 5)\n",
    "    return f\"{lower}-{lower + span}\"\n",
    "\n",
    "\n",
    "def get_gender():\n",
    "    \"\"\"Return a random target gender code: \"M\", \"F\" or \"A\".\"\"\"\n",
    "    target_genders = (\"M\", \"F\", \"A\")\n",
    "    return random.choice(target_genders)\n",
    "\n",
    "\n",
    "def get_currency():\n",
    "    \"\"\"Return a random currency code: \"GBP\", \"EUR\" or \"USD\".\"\"\"\n",
    "    currencies = (\"GBP\", \"EUR\", \"USD\")\n",
    "    return random.choice(currencies)\n",
    "\n",
    "\n",
    "def get_campaign_name():\n",
    "    \"\"\"Assemble a campaign name from randomly generated parts.\n",
    "\n",
    "    The parts are joined with underscores in the order\n",
    "    InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency.\n",
    "    \"\"\"\n",
    "    name_parts = [\n",
    "        get_type(),\n",
    "        *get_start_end_dates(),\n",
    "        get_age_range(),\n",
    "        get_gender(),\n",
    "        get_currency(),\n",
    "    ]\n",
    "    return \"_\".join(name_parts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# campaign data:\n",
    "# name, budget, spent, clicks, impressions\n",
    "def get_campaign_data():\n",
    "    \"\"\"Return one campaign dict with randomised figures.\n",
    "\n",
    "    Keys: cmp_name, cmp_bgt, cmp_spent, cmp_clicks, cmp_impr.\n",
    "    The amount spent is drawn between 100 and the budget.\n",
    "    \"\"\"\n",
    "    campaign = {\"cmp_name\": get_campaign_name()}\n",
    "    campaign[\"cmp_bgt\"] = random.randint(10**3, 10**6)\n",
    "    campaign[\"cmp_spent\"] = random.randint(\n",
    "        10**2, campaign[\"cmp_bgt\"]\n",
    "    )\n",
    "    campaign[\"cmp_clicks\"] = int(\n",
    "        random.triangular(10**2, 10**5, 0.2 * 10**5)\n",
    "    )\n",
    "    campaign[\"cmp_impr\"] = int(random.gauss(0.5 * 10**6, 2))\n",
    "    return campaign"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# assemble the logic to get the final version of the rough data\n",
    "# data will be a list of dictionaries. Each dictionary will follow\n",
    "# this structure:\n",
    "# {'user': user_json, 'campaigns': [c1, c2, ...]}\n",
    "# where user_json is the JSON string version of a user data dict\n",
    "# and c1, c2, ... are campaign dicts as returned by\n",
    "# get_campaign_data\n",
    "\n",
    "\n",
    "def get_data(users):\n",
    "    \"\"\"Pair every user with between 2 and 8 random campaigns.\n",
    "\n",
    "    Returns a list with one {\"user\": ..., \"campaigns\": [...]}\n",
    "    dict per item of users, in the same order.\n",
    "    \"\"\"\n",
    "    return [\n",
    "        {\n",
    "            \"user\": user,\n",
    "            \"campaigns\": [\n",
    "                get_campaign_data()\n",
    "                for _ in range(random.randint(2, 8))\n",
    "            ],\n",
    "        }\n",
    "        for user in users\n",
    "    ]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2 - Cleaning the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'user': '{\"username\": \"epennington\", \"name\": \"Stephanie Gonzalez\", \"gender\": \"F\", \"email\": \"harmontammy@example.com\", \"age\": 44, \"address\": \"48185 Brittany Green Suite 999\\\\nEast Tanya, KS 25787\"}',\n",
       "  'campaigns': [{'cmp_name': 'KTR_20250404_20250916_35-50_A_EUR',\n",
       "    'cmp_bgt': 964496,\n",
       "    'cmp_spent': 29586,\n",
       "    'cmp_clicks': 36632,\n",
       "    'cmp_impr': 500001},\n",
       "   {'cmp_name': 'AKX_20240130_20241017_20-25_M_GBP',\n",
       "    'cmp_bgt': 344739,\n",
       "    'cmp_spent': 166010,\n",
       "    'cmp_clicks': 67325,\n",
       "    'cmp_impr': 499999}]},\n",
       " {'user': '{\"username\": \"joshua61\", \"name\": \"Diana Richards\", \"gender\": \"F\", \"email\": \"ericfischer@example.net\", \"age\": 69, \"address\": \"229 Mcmillan Circles Suite 408\\\\nNorth Teresamouth, NJ 30402\"}',\n",
       "  'campaigns': [{'cmp_name': 'BYU_20230828_20250115_25-45_M_GBP',\n",
       "    'cmp_bgt': 177403,\n",
       "    'cmp_spent': 125738,\n",
       "    'cmp_clicks': 29989,\n",
       "    'cmp_impr': 499997},\n",
       "   {'cmp_name': 'AKX_20250216_20261129_45-60_F_USD',\n",
       "    'cmp_bgt': 618256,\n",
       "    'cmp_spent': 75017,\n",
       "    'cmp_clicks': 76301,\n",
       "    'cmp_impr': 500000},\n",
       "   {'cmp_name': 'AKX_20231229_20250721_20-40_F_GBP',\n",
       "    'cmp_bgt': 113805,\n",
       "    'cmp_spent': 12583,\n",
       "    'cmp_clicks': 48915,\n",
       "    'cmp_impr': 500001}]}]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# fetch simulated rough data: get_data returns one\n",
    "# {'user': ..., 'campaigns': [...]} dict per user\n",
    "rough_data = get_data(users)\n",
    "\n",
    "rough_data[:2]  # let us take a peek"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'cmp_name': 'KTR_20250404_20250916_35-50_A_EUR',\n",
       "  'cmp_bgt': 964496,\n",
       "  'cmp_spent': 29586,\n",
       "  'cmp_clicks': 36632,\n",
       "  'cmp_impr': 500001,\n",
       "  'user': '{\"username\": \"epennington\", \"name\": \"Stephanie Gonzalez\", \"gender\": \"F\", \"email\": \"harmontammy@example.com\", \"age\": 44, \"address\": \"48185 Brittany Green Suite 999\\\\nEast Tanya, KS 25787\"}'},\n",
       " {'cmp_name': 'AKX_20240130_20241017_20-25_M_GBP',\n",
       "  'cmp_bgt': 344739,\n",
       "  'cmp_spent': 166010,\n",
       "  'cmp_clicks': 67325,\n",
       "  'cmp_impr': 499999,\n",
       "  'user': '{\"username\": \"epennington\", \"name\": \"Stephanie Gonzalez\", \"gender\": \"F\", \"email\": \"harmontammy@example.com\", \"age\": 44, \"address\": \"48185 Brittany Green Suite 999\\\\nEast Tanya, KS 25787\"}'}]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Let's start from having a different version of the data\n",
    "# We want a list whose items will be dicts. Each dict is\n",
    "# the original campaign dict plus the user JSON\n",
    "#\n",
    "# A new dict is built for every campaign instead of calling\n",
    "# campaign.update(...): updating in place would also change the\n",
    "# dicts held by rough_data, which would then no longer match the\n",
    "# output displayed by the previous cell.\n",
    "\n",
    "data = [\n",
    "    {**campaign, \"user\": datum[\"user\"]}\n",
    "    for datum in rough_data\n",
    "    for campaign in datum[\"campaigns\"]\n",
    "]\n",
    "data[:2]  # let us take another peek"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Warning: Uncommenting and executing this cell will overwrite data.json\n",
    "# with open(\"data.json\", \"w\") as stream:\n",
    "#     stream.write(json.dumps(data))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
